Harvesting Web 2.0: An Introduction to API Processes in R
MAJ Ross Schuchard
ARCYBER
20 May 2016
Featured packages: `twitteR` and `gtrendsR`
SOURCE: 18 May 2016
R serves as an ideal platform for using APIs given the vast array of R-supported API wrapper packages, plus the ability to 'play' with the resulting acquired data in an R environment. The `twitteR` package is an API wrapper that serves as an interface to the Twitter Web API; see the `twitteR` CRAN documentation for details.
#Install package if you do not have in your package repository
#install.packages("twitteR") #uncomment if you require installation
#Call twitteR package
library(twitteR)
#Create access credential variables (change to your specific keys and tokens)
#NOTE: keys and tokens must be quoted character strings, not bare names
api_key <- "YOURapi_key" #include quotation marks around keys and tokens
api_secret <- "YOURapi_secret"
token <- "YOURtoken"
token_secret <- "YOURtoken_secret"
# Create Twitter Connection
setup_twitter_oauth(api_key, api_secret, token, token_secret) #Answer 'YES' to create connection
## [1] "Using direct authentication"
#Conduct search by keyword and limit to 100 responses
egypt <- searchTwitter("egyptair", n=100)
#Transform returned tweets into a dataframe structure
egypt.df <- twListToDF(egypt)
#View metadata column fields of dataframe
names(egypt.df)
## [1] "text" "favorited" "favoriteCount" "replyToSN"
## [5] "created" "truncated" "replyToSID" "id"
## [9] "replyToUID" "statusSource" "screenName" "retweetCount"
## [13] "isRetweet" "retweeted" "longitude" "latitude"
#Conduct search by keyword and limit to 1000 responses
egyptISIS <- searchTwitter("egyptair + ISIS", n=1000)
#Transform returned tweets into a dataframe structure
egyptISIS.df <- twListToDF(egyptISIS)
#Convert harvested tweet dataframe to a .csv file for later use
write.csv(egyptISIS.df, "EgyptAirTweets.csv", row.names=FALSE)

#Load supporting packages for text processing and visualization
library(RCurl)
library(RJSONIO)
library(stringr)
library(tm)
library(igraph)
library(RColorBrewer)
library(httr)
library(wordcloud)
# Get text data from the result of Twitter search
text1 <- sapply(egyptISIS, function(x) x$getText())
#Prepare text through extensive regex operations to normalize to substantive words
#Remove retweets to avoid duplicative text
text1 = gsub("(RT|via)((?:\\b\\W*@\\w+)+)", "", text1)
#Remove mentions from text corpus (i.e. @somebody)
text1 = gsub("@\\w+", "", text1)
#Remove punctuation
text1 = gsub("[[:punct:]]", "", text1)
#Remove numbers
text1 = gsub("[[:digit:]]", "", text1)
#Remove html links (punctuation is already stripped above, so a URL is one "httpxxxx" token here)
text1 = gsub("http\\w+", "", text1)
#Collapse runs of whitespace to a single space
#(replacing with "" would fuse adjacent words, e.g. "word1  word2" -> "word1word2")
text1 = gsub("[ \t]{2,}", " ", text1)
#Trim leading and trailing whitespace
text1 = gsub("^\\s+|\\s+$", "", text1)
#Safely convert text to lower case: returns NA instead of stopping the
#script when tolower() errors (e.g. on invalid multibyte strings).
tryTolower = function(x)
{
  tryCatch(tolower(x), error = function(e) NA)
}
# lower case using tryTolower with sapply
text1 = sapply(text1, tryTolower)
#Establish corpus of words
text1_corpus <- Corpus(VectorSource(text1))
#Place in matrix format
tdm = TermDocumentMatrix(
  text1_corpus,
  control = list(
    removePunctuation = TRUE,
    stopwords = c(stopwords("en")),
    removeNumbers = TRUE,
    tolower = TRUE)
)
m = as.matrix(tdm)
#Establish word counts in decreasing order
word_freqs = sort(rowSums(m), decreasing = TRUE)
#Create a data frame with words and their frequencies
dm = data.frame(word = names(word_freqs), freq = word_freqs)
#Declare plot name
plotfile1 <- "EgyptISIS_wordcloud.png"
#Open the png graphics device BEFORE plotting so the cloud is written to the file
#(opening it after wordcloud(), as originally written, captures nothing)
png(filename=plotfile1, width=740, height=740, units="px")
#Force creation of word cloud
#"Dark2" provides at most 8 colors; requesting more triggers a warning
wordcloud(dm$word, dm$freq, min.freq=2, random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))
#Close the graphics device to flush the plot to disk
dev.off()
#NOTE: Natural language and word cloud code adapted
#from http://davetang.org/muse/2013/04/06/using-the-r_twitter-package/

#Limit our search to a specific geographic area to assume confidence in the location of those tweets
#Search for last 10000 tweets limited to 50 mile radius around Paris
egyptParis <- searchTwitter("egyptair", n=10000, geocode='48.8647,2.3490,50mi')
## [1] "Rate limited .... blocking for a minute and retrying up to 119 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 118 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 117 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 116 times ..."
## [1] "Rate limited .... blocking for a minute and retrying up to 115 times ..."
#Transform returned tweets into a dataframe structure
egyptParis.df <- twListToDF(egyptParis)
#Convert harvested tweet dataframe to a .csv file for later use
write.csv(egyptParis.df, "EgyptParisTweets.csv", row.names=FALSE)

#Use mapping package leaflet to view geolocation results for entire dataframe
library(leaflet)
paris <- read.csv("EgyptParisTweets.csv")
names(paris)
## [1] "text" "favorited" "favoriteCount" "replyToSN"
## [5] "created" "truncated" "replyToSID" "id"
## [9] "replyToUID" "statusSource" "screenName" "retweetCount"
## [13] "isRetweet" "retweeted" "longitude" "latitude"
#Subset dataframe to only those tweets with coordinates
#(reference columns by name rather than position 15, which silently
# breaks if the column order ever changes; require both coordinates)
egyptCoords <- paris[!is.na(paris$longitude) & !is.na(paris$latitude),]
#Declare map focused on Europe continent, with Paris as the centroid
m <- leaflet() %>% setView(lng = 2.3490, lat = 48.8647, zoom = 10)
#Add specific map type and reference data frame for coordinates to plot
m %>% addProviderTiles("CartoDB.Positron") %>%
  addCircles(data = egyptCoords, lat = ~ latitude, lng = ~ longitude)

#Access to GoogleTrends through `gtrendsR` package
#Install package if you do not already have it
#install.packages("gtrendsR")
#Declare package
library(gtrendsR)
#Create variables for your username/password authentication (MAKE SURE TO INPUT YOUR CREDENTIALS)
#NOTE: credentials must be quoted character strings, not bare names
user <- "Youruser" #Insert your google id (i.e. 'joe.smith@gmail.com')
password <- "Yourpassword" #Insert your google password (i.e. 'password123')
#Establish connection to google
gconnect(user, password)
#Explore a trend
hotJobs <- gtrends(c("data science", "operations research"))
#Plot results
plot(hotJobs)

#Take a look at specific regions (Germany and the United States)
#Specify the region codes to query
location <- c("DE","US")
#Keyword query
query <- c("Donald Trump")
#Submit API request with explicit date constraints
trump <- gtrends(query, location, start_date = "2015-01-01", end_date = "2016-05-18")
#Provide plot of results
plot(trump)
#NOTE(review): a trailing extraction fragment here referenced the `httr`
#package and "SOURCE: London Dashboard" -- original context lost in extraction